import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
# Load the dataset from a local CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this notebook.
df = pd.read_csv("D:/מבוא למדעי הנתונים/מטלה 1/death_prediction_synthetic.csv")
# Show the (rows, columns) dimensions of the loaded frame.
df.shape
(12438, 45)
# Lift the column-display limit so wide frames are shown without truncation.
pd.options.display.max_columns = None
# Preview the first 20 rows.
df.head(n=20)
| age | sex | marital_status | ses | residence_cd | residence | weigh | heigh | BMI | bp_sys | bp_dias | bp_cat | smoking | smoking_status | HbA1c | glucose | creatinin | albumin | alb24h | ACR | cholesterol_total | LDL | HDL | triglycerides | TSH | gravity_u | nitrites_u | leuko_u | proteinuria | WBC | RBC | platelets | MCV | MPV | Charlson | framingham_cvd | antidiabetics | ERD | CVD | HTN | cancer | cardiovascular_meds | statines | immigrant | dead_5y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | 2 | U | 1 | 1 | urban | 84.0 | 160.0 | 32.79 | 116 | 66 | Normal | 1 | non_smoker | 11.0 | 238 | 0.58 | 3.94 | 32.0 | 6.00 | 211.6 | 114.0 | 39.0 | 373 | 12.5 | 1.025 | 0.0 | 0.0 | 0.0 | 10.60 | 5.11 | 295 | 87.9 | 7.7 | 1 | 0.12908 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | 46 | 2 | M | 1 | 1 | urban | 67.0 | 152.0 | 29.38 | 113 | 74 | Normal | 1 | non_smoker | 7.0 | 119 | 0.65 | 4.50 | 17.9 | 3.87 | 154.0 | 79.0 | 41.0 | 204 | 13.5 | 1.021 | 0.0 | 0.0 | 0.0 | 8.70 | 4.45 | 204 | 78.1 | 8.2 | 1 | 0.08500 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 2 | 46 | 2 | U | 1 | 1 | urban | 62.4 | 148.0 | 28.31 | 133 | 87 | Pre-HTN | 1 | non_smoker | 11.8 | 202 | 0.59 | NaN | NaN | 6.35 | 252.0 | 65.0 | 57.0 | 272 | 13.8 | 1.030 | 0.0 | 0.0 | 0.0 | 11.50 | 5.73 | 772 | 85.1 | 10.2 | 1 | 0.10715 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 3 | 45 | 1 | U | 2 | 1 | urban | 73.9 | 166.0 | 26.85 | 100 | 63 | Normal | 3 | current_smoker | 6.9 | 203 | 0.70 | 4.60 | 9.1 | 5.00 | 149.0 | 92.0 | 37.0 | 84 | 12.6 | 1.017 | 0.0 | 1.0 | 0.0 | 9.10 | 5.06 | 299 | 84.6 | 9.1 | 1 | 0.10110 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 49 | 2 | U | 1 | 1 | urban | 53.0 | 161.0 | 20.45 | 110 | 70 | Normal | 1 | non_smoker | 8.7 | 152 | 0.75 | 4.40 | NaN | 0.48 | 178.0 | 94.8 | 57.0 | 140 | 11.9 | 1.028 | 0.0 | 0.0 | 0.0 | 6.10 | 4.64 | 181 | 89.8 | 8.9 | 1 | 0.03950 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5 | 41 | 1 | M | 1 | 1 | urban | 107.0 | 193.0 | 28.86 | 115 | 80 | Normal | 3 | current_smoker | 6.9 | 143 | 0.90 | 4.80 | NaN | 5.00 | 187.0 | 112.5 | 31.0 | 182 | 12.1 | 1.025 | 0.0 | 0.0 | 0.0 | 10.60 | 5.20 | 313 | 82.2 | 10.1 | 1 | 0.15690 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 6 | 46 | 2 | U | 2 | 1 | urban | 89.0 | 172.0 | 30.27 | 123 | 84 | Normal | 3 | current_smoker | 5.7 | 134 | 0.89 | 4.60 | NaN | 5.00 | 140.0 | 79.9 | 43.0 | 68 | 16.1 | 1.020 | 0.0 | 0.0 | 0.0 | 7.30 | 4.20 | 290 | 80.6 | 9.1 | 1 | 0.12122 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 7 | 47 | 1 | M | 2 | 1 | urban | 85.0 | 175.0 | 27.92 | 120 | 80 | Normal | 1 | non_smoker | 7.3 | 152 | 0.96 | 4.60 | 45.0 | 30.00 | 158.6 | 87.7 | 31.0 | 187 | 13.2 | NaN | NaN | NaN | NaN | 9.50 | 4.96 | 307 | 86.7 | 9.3 | 2 | 0.11748 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 8 | 42 | 1 | M | 1 | 1 | urban | 87.0 | 175.0 | 28.41 | 110 | 60 | Normal | 1 | non_smoker | 6.4 | 117 | 0.74 | 4.40 | NaN | 4.16 | 180.0 | 117.9 | 43.0 | 111 | 12.2 | 1.025 | 0.0 | 0.0 | 0.0 | 7.60 | 5.34 | 215 | 83.7 | 10.4 | 1 | 0.04838 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 44 | 2 | M | 1 | 1 | urban | 72.0 | 164.0 | 26.77 | 137 | 76 | Pre-HTN | 1 | non_smoker | 6.1 | 55 | 0.49 | 4.25 | NaN | 3.14 | 214.0 | 98.0 | 80.0 | 76 | 11.8 | 1.011 | 0.0 | 0.0 | 0.0 | 9.96 | 4.25 | 396 | 89.1 | 7.5 | 2 | 0.09586 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 33 | 1 | M | 1 | 1 | urban | 112.0 | 185.0 | 32.72 | 130 | 80 | Pre-HTN | 1 | non_smoker | 6.2 | 94 | 1.07 | NaN | NaN | 5.00 | 172.8 | 105.0 | 44.0 | 104 | 13.2 | 1.015 | 0.0 | 0.0 | 0.0 | 10.90 | 5.29 | 199 | 84.9 | 9.9 | 1 | 0.03946 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 11 | 45 | 1 | M | 1 | 1 | urban | 90.0 | 180.0 | 27.78 | 108 | 69 | Normal | 3 | current_smoker | 6.6 | 95 | 0.72 | 4.30 | NaN | 3.00 | 148.7 | 73.0 | 31.0 | 237 | 13.6 | NaN | NaN | NaN | NaN | 8.60 | 5.04 | 275 | 86.6 | 8.3 | 1 | 0.15619 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 12 | 44 | 2 | M | 1 | 1 | urban | 60.0 | 158.0 | 23.85 | 106 | 70 | Normal | 1 | non_smoker | 12.4 | 344 | 0.54 | 4.11 | NaN | NaN | 322.0 | 111.5 | 65.0 | 336 | 14.1 | NaN | NaN | NaN | NaN | 6.49 | 4.80 | 331 | 81.8 | 8.0 | 1 | 0.08461 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 13 | 43 | 1 | M | 1 | 1 | urban | 98.5 | 186.0 | 28.60 | 129 | 84 | Normal | 3 | current_smoker | 12.0 | 118 | 0.72 | 4.40 | NaN | 15.50 | 174.2 | 118.8 | 40.0 | 76 | 14.1 | 1.010 | 0.0 | 0.0 | 0.0 | 10.90 | 5.23 | 179 | 89.7 | 9.8 | 1 | 0.23632 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 48 | 1 | M | 1 | 1 | urban | 64.5 | 177.0 | 20.83 | 115 | 65 | Normal | 2 | past_smoker | 7.3 | 76 | 0.69 | 4.66 | 10.0 | 10.00 | 176.5 | 123.0 | 38.0 | 61 | 12.0 | 1.023 | 0.0 | 0.0 | 0.0 | 7.40 | 5.00 | 339 | 88.0 | 9.9 | 1 | 0.11165 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 15 | 45 | 2 | M | 2 | 1 | urban | 55.0 | 156.0 | 22.60 | 100 | 60 | Normal | 1 | non_smoker | 8.1 | 118 | 0.54 | 4.66 | 7.0 | 65.92 | 190.0 | 126.2 | 60.0 | 87 | 13.2 | 1.015 | 0.0 | 0.0 | 0.0 | 5.10 | 4.60 | 203 | 86.2 | 8.8 | 2 | 0.08174 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 16 | 46 | 1 | U | 2 | 1 | urban | 100.0 | 179.0 | 31.21 | 125 | 74 | Normal | 3 | current_smoker | 6.2 | 119 | 0.90 | 4.50 | NaN | 3.19 | 136.0 | 74.0 | 35.0 | 118 | 13.0 | 1.010 | 0.0 | 0.0 | 0.0 | 12.22 | 4.89 | 283 | 80.4 | 8.8 | 1 | 0.18142 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 17 | 49 | 2 | M | 2 | 1 | urban | 78.0 | 157.5 | 31.24 | 130 | 89 | Pre-HTN | 1 | non_smoker | 6.1 | 129 | 0.57 | 4.30 | 45.0 | 123.34 | 183.9 | 96.9 | 40.0 | 220 | 12.5 | 1.033 | 0.0 | 0.0 | 0.0 | 7.40 | 4.49 | 318 | 85.6 | 8.8 | 1 | 0.11276 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 |
| 18 | 39 | 1 | M | 2 | 1 | urban | 105.0 | 174.0 | 35.01 | 129 | 84 | Normal | 2 | past_smoker | 12.6 | 110 | 0.92 | 4.50 | NaN | 58.00 | 178.0 | 117.0 | 33.0 | 103 | 13.0 | NaN | NaN | NaN | NaN | 11.80 | 5.98 | 328 | 79.5 | 8.0 | 3 | 0.07689 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 19 | 45 | 1 | U | 2 | 1 | urban | 83.0 | 169.0 | 29.06 | 117 | 86 | Pre-HTN | 1 | non_smoker | 8.6 | 253 | 0.88 | 4.60 | 26.5 | 8.59 | 133.0 | 50.1 | 45.0 | 183 | 11.2 | 1.025 | 0.0 | 0.0 | 0.0 | 7.50 | 5.08 | 295 | 85.4 | 12.1 | 1 | 0.08349 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
# List the distinct values stored in the 'cancer' column.
df['cancer'].unique()
array([0], dtype=int64)
# Remove three columns from the frame in place.
# NOTE(review): 'residence_cd' and 'smoking' look like coded duplicates of
# 'residence' and 'smoking_status' - confirm before relying on that.
df.drop(columns=['residence_cd', 'smoking', 'cancer'], inplace=True)
# Count fully duplicated rows.
df.duplicated().sum()
0
# Print column dtypes and non-null counts after dropping the columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12438 entries, 0 to 12437 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 12438 non-null int64 1 sex 12438 non-null int64 2 marital_status 12438 non-null object 3 ses 12438 non-null int64 4 residence 12438 non-null object 5 weigh 12438 non-null float64 6 heigh 12438 non-null float64 7 BMI 12438 non-null float64 8 bp_sys 12438 non-null int64 9 bp_dias 12438 non-null int64 10 bp_cat 12438 non-null object 11 smoking_status 12438 non-null object 12 HbA1c 12438 non-null float64 13 glucose 12438 non-null int64 14 creatinin 12438 non-null float64 15 albumin 10975 non-null float64 16 alb24h 3520 non-null float64 17 ACR 11226 non-null float64 18 cholesterol_total 12438 non-null float64 19 LDL 12438 non-null float64 20 HDL 12438 non-null float64 21 triglycerides 12438 non-null int64 22 TSH 12438 non-null float64 23 gravity_u 9908 non-null float64 24 nitrites_u 9918 non-null float64 25 leuko_u 9931 non-null float64 26 proteinuria 9932 non-null float64 27 WBC 12438 non-null float64 28 RBC 12438 non-null float64 29 platelets 12438 non-null int64 30 MCV 12438 non-null float64 31 MPV 12438 non-null float64 32 Charlson 12438 non-null int64 33 framingham_cvd 12438 non-null float64 34 antidiabetics 12438 non-null int64 35 ERD 12438 non-null int64 36 CVD 12438 non-null int64 37 HTN 12438 non-null int64 38 cardiovascular_meds 12438 non-null int64 39 statines 12438 non-null int64 40 immigrant 12438 non-null int64 41 dead_5y 12438 non-null int64 dtypes: float64(21), int64(17), object(4) memory usage: 4.0+ MB
# Split the columns into numerical and categorical variables.
num_vars = [
    'age', 'weigh', 'heigh', 'BMI', 'bp_sys', 'bp_dias', 'HbA1c', 'glucose',
    'creatinin', 'albumin', 'alb24h', 'ACR', 'cholesterol_total', 'LDL', 'HDL',
    'triglycerides', 'TSH', 'gravity_u', 'WBC', 'RBC', 'platelets', 'MCV',
    'MPV', 'framingham_cvd',
]
# Every column not listed as numerical is treated as categorical.
cat_vars = df.columns[~df.columns.isin(num_vars)].tolist()
num_vars_len = len(num_vars)
cat_vars_len = len(cat_vars)

# Convert each categorical variable to the pandas 'category' dtype.
for cat_name in cat_vars:
    df[cat_name] = df[cat_name].astype('category')

print("The length of the numerical variables is:", num_vars_len)
print("The length of the categorical variables is:", cat_vars_len)
The length of the numerical variables is: 24 The length of the categorical variables is: 18
# Print column dtypes again to check the result of the category conversion.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12438 entries, 0 to 12437 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 12438 non-null int64 1 sex 12438 non-null category 2 marital_status 12438 non-null category 3 ses 12438 non-null category 4 residence 12438 non-null category 5 weigh 12438 non-null float64 6 heigh 12438 non-null float64 7 BMI 12438 non-null float64 8 bp_sys 12438 non-null int64 9 bp_dias 12438 non-null int64 10 bp_cat 12438 non-null category 11 smoking_status 12438 non-null category 12 HbA1c 12438 non-null float64 13 glucose 12438 non-null int64 14 creatinin 12438 non-null float64 15 albumin 10975 non-null float64 16 alb24h 3520 non-null float64 17 ACR 11226 non-null float64 18 cholesterol_total 12438 non-null float64 19 LDL 12438 non-null float64 20 HDL 12438 non-null float64 21 triglycerides 12438 non-null int64 22 TSH 12438 non-null float64 23 gravity_u 9908 non-null float64 24 nitrites_u 9918 non-null category 25 leuko_u 9931 non-null category 26 proteinuria 9932 non-null category 27 WBC 12438 non-null float64 28 RBC 12438 non-null float64 29 platelets 12438 non-null int64 30 MCV 12438 non-null float64 31 MPV 12438 non-null float64 32 Charlson 12438 non-null category 33 framingham_cvd 12438 non-null float64 34 antidiabetics 12438 non-null category 35 ERD 12438 non-null category 36 CVD 12438 non-null category 37 HTN 12438 non-null category 38 cardiovascular_meds 12438 non-null category 39 statines 12438 non-null category 40 immigrant 12438 non-null category 41 dead_5y 12438 non-null category dtypes: category(18), float64(18), int64(6) memory usage: 2.5 MB
# NOTE(review): this binds the DataFrame class itself (not an instance) and the
# name is not used in the visible code - confirm whether it is needed.
describe = pd.DataFrame

# Print the interquartile range and summary statistics of each numerical variable.
for num_name in num_vars:
    values = df[num_name]
    upper_quartile = values.quantile(.75)
    lower_quartile = values.quantile(.25)
    print('IQR:', upper_quartile - lower_quartile)
    print(values.describe())
    print('--------------------')
IQR: 7.0 count 12438.000000 mean 43.501126 std 4.802035 min 30.000000 25% 41.000000 50% 45.000000 75% 48.000000 max 49.000000 Name: age, dtype: float64 -------------------- IQR: 24.0 count 12438.000000 mean 89.837746 std 19.296247 min 41.000000 25% 76.000000 50% 88.000000 75% 100.000000 max 203.000000 Name: weigh, dtype: float64 -------------------- IQR: 15.0 count 12438.000000 mean 167.943906 std 10.068620 min 91.500000 25% 160.000000 50% 168.000000 75% 175.000000 max 202.000000 Name: heigh, dtype: float64 -------------------- IQR: 7.950000000000003 count 12438.000000 mean 31.871922 std 6.459714 min 15.790000 25% 27.400000 50% 31.050000 75% 35.350000 max 95.550000 Name: BMI, dtype: float64 -------------------- IQR: 12.0 count 12438.000000 mean 124.803988 std 13.615760 min 75.000000 25% 118.000000 50% 124.000000 75% 130.000000 max 200.000000 Name: bp_sys, dtype: float64 -------------------- IQR: 10.0 count 12438.000000 mean 77.315163 std 8.915362 min 40.000000 25% 70.000000 50% 79.000000 75% 80.000000 max 135.000000 Name: bp_dias, dtype: float64 -------------------- IQR: 2.2 count 12438.000000 mean 7.623666 std 1.867250 min 4.000000 25% 6.300000 50% 7.100000 75% 8.500000 max 16.800000 Name: HbA1c, dtype: float64 -------------------- IQR: 67.0 count 12438.000000 mean 153.757276 std 64.724782 min 40.000000 25% 110.000000 50% 134.000000 75% 177.000000 max 883.000000 Name: glucose, dtype: float64 -------------------- IQR: 0.24 count 12438.000000 mean 0.751235 std 0.428875 min 0.110000 25% 0.600000 50% 0.710000 75% 0.840000 max 11.150000 Name: creatinin, dtype: float64 -------------------- IQR: 0.4300000000000006 count 10975.000000 mean 4.327445 std 0.329294 min 2.500000 25% 4.100000 50% 4.300000 75% 4.530000 max 5.700000 Name: albumin, dtype: float64 -------------------- IQR: 40.0 count 3520.000000 mean 219.855858 std 4084.373676 min 0.000000 25% 10.000000 50% 17.800000 75% 50.000000 max 170385.000000 Name: alb24h, dtype: float64 -------------------- IQR: 17.13 count 
11226.000000 mean 38.829212 std 117.962082 min 0.000000 25% 3.870000 50% 7.000000 75% 21.000000 max 1494.570000 Name: ACR, dtype: float64 -------------------- IQR: 52.0 count 12438.000000 mean 183.736887 std 40.607061 min 68.000000 25% 156.000000 50% 180.000000 75% 208.000000 max 525.000000 Name: cholesterol_total, dtype: float64 -------------------- IQR: 43.10000000000001 count 12438.000000 mean 104.314327 std 33.206564 min 30.000000 25% 81.300000 50% 101.200000 75% 124.400000 max 250.000000 Name: LDL, dtype: float64 -------------------- IQR: 13.0 count 12438.000000 mean 42.296205 std 10.289038 min 11.000000 25% 35.000000 50% 41.000000 75% 48.000000 max 136.000000 Name: HDL, dtype: float64 -------------------- IQR: 122.0 count 12438.000000 mean 199.988583 std 152.421882 min 24.000000 25% 115.000000 50% 166.000000 75% 237.000000 max 3404.000000 Name: triglycerides, dtype: float64 -------------------- IQR: 1.4000000000000004 count 12438.000000 mean 13.801158 std 1.644545 min 10.200000 25% 12.900000 50% 13.500000 75% 14.300000 max 68.900000 Name: TSH, dtype: float64 -------------------- IQR: 0.01200000000000001 count 9908.000000 mean 1.022250 std 0.008226 min 1.000000 25% 1.016000 50% 1.021000 75% 1.028000 max 1.050000 Name: gravity_u, dtype: float64 -------------------- IQR: 2.8999999999999995 count 12438.000000 mean 8.279029 std 2.344549 min 1.300000 25% 6.700000 50% 8.000000 75% 9.600000 max 24.100000 Name: WBC, dtype: float64 -------------------- IQR: 0.6699999999999999 count 12438.000000 mean 4.953782 std 0.516850 min 3.000000 25% 4.600000 50% 4.935000 75% 5.270000 max 7.800000 Name: RBC, dtype: float64 -------------------- IQR: 95.0 count 12438.000000 mean 278.554832 std 77.703854 min 49.000000 25% 226.000000 50% 268.000000 75% 321.000000 max 808.000000 Name: platelets, dtype: float64 -------------------- IQR: 7.0 count 12438.000000 mean 83.439574 std 6.396095 min 53.600000 25% 80.400000 50% 84.000000 75% 87.400000 max 112.700000 Name: MCV, dtype: float64 
-------------------- IQR: 1.5999999999999996 count 12438.000000 mean 9.382099 std 1.311013 min 5.500000 25% 8.500000 50% 9.200000 75% 10.100000 max 19.000000 Name: MPV, dtype: float64 -------------------- IQR: 0.10216 count 12438.000000 mean 0.127533 std 0.092716 min 0.007310 25% 0.061800 50% 0.102370 75% 0.163960 max 0.802220 Name: framingham_cvd, dtype: float64 --------------------
# Print the frequency of each level for every categorical variable.
for cat_name in cat_vars:
    level_counts = df[cat_name].value_counts()
    print(level_counts)
    print('--------------------')
1 6607 2 5831 Name: sex, dtype: int64 -------------------- M 8537 U 3901 Name: marital_status, dtype: int64 -------------------- 1 6281 2 3890 3 2267 Name: ses, dtype: int64 -------------------- urban 11659 rural 779 Name: residence, dtype: int64 -------------------- Normal 7496 Pre-HTN 2809 HTN-G1 1691 HTN-G2 367 HTN-G3 75 Name: bp_cat, dtype: int64 -------------------- non_smoker 7806 current_smoker 3287 past_smoker 1345 Name: smoking_status, dtype: int64 -------------------- 0.0 9567 1.0 328 2.0 23 Name: nitrites_u, dtype: int64 -------------------- 0.0 7919 500.0 568 25.0 524 75.0 341 250.0 238 100.0 233 1.0 43 2.0 35 4.0 19 3.0 11 Name: leuko_u, dtype: int64 -------------------- 0.0 9428 1.0 504 Name: proteinuria, dtype: int64 -------------------- 1 6167 2 3465 3 1417 4 506 0 477 5 193 6 122 7 42 10 17 8 15 9 9 12 3 14 3 11 2 Name: Charlson, dtype: int64 -------------------- 1 10668 0 1770 Name: antidiabetics, dtype: int64 -------------------- 0 12356 1 82 Name: ERD, dtype: int64 -------------------- 0 11293 1 1145 Name: CVD, dtype: int64 -------------------- 0 8482 1 3956 Name: HTN, dtype: int64 -------------------- 1 7496 0 4942 Name: cardiovascular_meds, dtype: int64 -------------------- 1 8734 0 3704 Name: statines, dtype: int64 -------------------- 0 10263 1 2175 Name: immigrant, dtype: int64 -------------------- 0 12241 1 197 Name: dead_5y, dtype: int64 --------------------
# One grid row per numerical variable: histogram on the left, box plot on the right.
fig, ax = plt.subplots(num_vars_len, 2, figsize=(15, 80))
plt.subplots_adjust(hspace=0.5)
for var, (hist_ax, box_ax) in zip(num_vars, ax):
    sns.histplot(data=df, x=var, ax=hist_ax)
    sns.boxplot(data=df, x=var, ax=box_ax)
# Draw one count plot per categorical variable, stacked in a single column.
# PDF export is currently disabled; to enable it, wrap this block in a
# PdfPages(...) context manager and call savefig() on it, then plt.close().
fig, ax = plt.subplots(cat_vars_len, figsize=(4, 50))
plt.subplots_adjust(hspace=1)
for var, count_ax in zip(cat_vars, ax):
    sns.countplot(data=df, x=var, ax=count_ax)
# Build every unordered pair of numerical variables exactly once.
# combinations() emits each pair a single time, in num_vars order, so the
# result is deterministic. The previous approach went through set(), whose
# element order depends on string hashing, and de-duplicated by keeping every
# second entry of a sorted list; that could drop or repeat pairs.
from itertools import combinations

pairs = [list(pair) for pair in combinations(num_vars, 2)]
# Number of pairs: n * (n - 1) / 2 for n numerical variables.
len(pairs)
276
# Scatter-plot every pair of numerical variables, with the Spearman correlation
# and its p-value in the panel title.
# The grid is sized from len(pairs) rather than hard-coded, so changing
# num_vars cannot cause an out-of-range panel index.
n_cols = 6
n_rows = max(1, -(-len(pairs) // n_cols))  # ceiling division, at least one row
# Keep the original per-row height (200 inches for 46 rows).
fig_height = 200 * n_rows / 46
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(25, fig_height), squeeze=False)
plt.subplots_adjust(hspace=0.5)
for i, (var1, var2) in enumerate(pairs):
    panel = ax[i // n_cols, i % n_cols]
    # nan_policy='omit' ignores missing values in the correlation.
    corr, p = spearmanr(df[var1], df[var2], nan_policy='omit')
    title = f"corr: {round(corr, 3)} , p value: {round(p, 3)}"
    sns.scatterplot(data=df, x=var1, y=var2, ax=panel)
    # Highlight strong correlations in red, otherwise significant ones in yellow.
    if abs(corr) >= 0.7:
        panel.set_title(title, color='blue', backgroundcolor='red')
    elif p < 0.05:
        panel.set_title(title, color='blue', backgroundcolor='yellow')
    else:
        panel.set_title(title, color='blue')
# Hide panels left over when the pair count does not fill the last row.
for unused_panel in ax.flat[len(pairs):]:
    unused_panel.set_visible(False)